Demo: Exploring clinical data from multiple commons

Fan Wang

July 15 2020

Data Source:

Project Data description Subjects Source
open-DSFSI (Kaggle) Epidemiology individual-level line lists in Africa 10087 Africa CDC and Ministries of Health
open-nCoV2019 (UW) Epidemiology individual-level line lists around the world 1083 University of Washington Institute for Health Metrics and Evaluation
TCGA-COAD (GDC) Clinical data for TCGA Colon Adenocarcinoma samples 461 NCI Genomic Data Commons (GDC)

In this notebook, we obtain the clinical data for multiple projects from Chicagoland Pandemic Response Commons and NCI Genomic Data Commons (GDC) and visualize some of the clinical attributes. This notebook is for demonstration purposes only and not biologically meaningful.

Import libraries

In [1]:
%%capture
import requests
import json
import pandas as pd
import requests, json, fnmatch, os, os.path, sys, subprocess, glob, ntpath, copy, gen3
from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission
from functools import reduce
# visual libraries
! pip install upsetplot
! pip install joypy
import upsetplot
from upsetplot import UpSet
import warnings
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import joypy
import numpy as np
# notebook setting
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

Use GDC API to extract clinical data for TCGA-COAD

In [2]:
def get_gdc_clinical(project: str):
    gdc_url = "https://api.gdc.cancer.gov/cases"
    headers = {"Content-Type": "application/json"}
    fields = "case_id,submitter_id,project.project_id,project.name"
    expand = "diagnoses,demographic,family_histories"
    params = {
        "filters": {
            "op": "in",
            "content": {"field": "project.project_id", "value": [project]},
        },
        "format": "TSV",
        "size": "10000",
        "fields": fields,
        "expand": expand,
    }
    response = requests.post(
        gdc_url, data=json.dumps(params), headers=headers, stream=True
    )
    return response.content


# TCGA-OV
project = "TCGA-COAD"
ofil = "{0}.clinical.tsv".format(project)
with open(ofil, "wt") as o:
    o.write(get_gdc_clinical(project).decode("utf-8"))

gdc_df = pd.read_csv(ofil, sep="\t")

Visualize Age at Diagnosis and Year to Death by race in TCGA-COAD

In [3]:
df = gdc_df[
    ["demographic.race", "diagnoses.0.age_at_diagnosis", "demographic.days_to_death"]
]
df["Age at diagnosis"] = df["diagnoses.0.age_at_diagnosis"] / 365
df["Year to death"] = df["demographic.days_to_death"] / 365
df["Race"] = df["demographic.race"]
df = df.dropna(subset=["Age at diagnosis"])
plt.figure(figsize=(8, 6), dpi=200)
fig, axes = joypy.joyplot(
    df.loc[(df.Race != "native hawaiian or other pacific islander") & (df.Race != "asian") & (df.Race != "american indian or alaska native")],
    column=["Age at diagnosis", "Year to death"],
    by="Race",
    ylim="own",
    figsize=(8, 6),
    overlap=1,
    legend=True,
)
plt.title("Distribution of Age at Diagnosis and Year to Death by Race", fontsize=15)
plt.show()
<Figure size 1600x1200 with 0 Axes>

Visualize Tumor Stage and Vital Status in TCGA-COAD

In [4]:
plt.figure(figsize=(10, 8), dpi=200)
vital_figo_df = (
    gdc_df.loc[:, ["case_id", "demographic.vital_status", "diagnoses.0.tumor_stage"]]
    .groupby(["demographic.vital_status", "diagnoses.0.tumor_stage"])
    .size()
    .reset_index(name="counts")
)
vital_figo_df = vital_figo_df.loc[
    vital_figo_df["demographic.vital_status"] != "Not Reported"
]
table = pd.pivot_table(
    vital_figo_df,
    columns=["demographic.vital_status"],
    values=["counts"],
    index="diagnoses.0.tumor_stage",
    aggfunc=np.sum,
    fill_value=0,
)
table["index"] = range(14)
g = sns.catplot(
    x="counts",
    y="diagnoses.0.tumor_stage",
    data=vital_figo_df,
    hue="demographic.vital_status",
    kind="bar",
    height=6,
    aspect=1.5,
    legend=False,
)
for row in table.itertuples():
    g.ax.text(
        row[1] + 6,
        row[3] - 0.1,
        "n=" + str(row[1]),
        horizontalalignment="center",
        size=9,
        color="black",
    )
for row in table.itertuples():
    g.ax.text(
        row[2] + 3,
        row[3] + 0.28,
        "n=" + str(row[2]),
        horizontalalignment="center",
        size=9,
        color="black",
    )
plt.xlabel("Number of Samples")
plt.ylabel("Tumor Stage")
plt.yticks(fontsize=10)
plt.title("Sample Size by Tumor Stage and Vital Status", fontsize=15)
plt.legend()
plt.show()
<Figure size 2000x1600 with 0 Axes>

Use Gen3 SDK to extract clinical data for COVID-19 datasets

To extract the data we need, we simply export the demographic and observation nodes from the Chicagoland Pandemic Response Commons.

In [5]:
api = "https://chicagoland.pandemicresponsecommons.org/"
creds = "credentials_covid.json"
auth = Gen3Auth(api, refresh_file=creds)
sub = Gen3Submission(api, auth)

def get_node_tsvs(node, projects=None, overwrite=False):
    # Get a TSV of the node(s) specified for each project specified
    if not isinstance(node, str):  # Create folder on VM for downloaded files
        mydir = "downloaded_tsvs"
    else:
        mydir = str(node + "_tsvs")
    if not os.path.exists(mydir):
        os.makedirs(mydir)
    if projects is None:  # if no projects specified, get node for all projects
        project_ids = list(
            json_normalize(
                sub.query("""{project (first:0){project_id}}""")["data"]["project"]
            )["project_id"]
        )
    elif isinstance(projects, str):
        projects = [projects]
    dfs = []
    df_len = 0
    for project in projects:
        filename = str(mydir + "/" + project + "_" + node + ".tsv")
        if (os.path.isfile(filename)) and (overwrite is False):
            print("File previously downloaded.")
        else:
            prog, proj = project.split("-", 1)
            sub.export_node(prog, proj, node, "tsv", filename)
        df1 = pd.read_csv(filename, sep="\t", header=0)
        dfs.append(df1)
        df_len += len(df1)
        print(filename + " has " + str(len(df1)) + " records.")
    all_data = pd.concat(dfs, ignore_index=True)
    print("length of all dfs: " + str(df_len))
    nodefile = str("master_" + node + ".tsv")
    all_data.to_csv(str(mydir + "/" + nodefile), sep="\t")
    print(
        "Master node TSV with "
        + str(len(all_data))
        + " total records written to "
        + nodefile
        + "."
    )
    return all_data

# Get subject node tsvs
demographic_DSFSI = get_node_tsvs("demographic", "open-DSFSI")
demographic_nCOV2019 = get_node_tsvs("demographic", "open-nCoV2019")
observation_nCOV2019 = get_node_tsvs("observation", "open-nCoV2019")
File previously downloaded.
demographic_tsvs/open-DSFSI_demographic.tsv has 10087 records.
length of all dfs: 10087
Master node TSV with 10087 total records written to master_demographic.tsv.
File previously downloaded.
demographic_tsvs/open-nCoV2019_demographic.tsv has 1083 records.
length of all dfs: 1083
Master node TSV with 1083 total records written to master_demographic.tsv.
File previously downloaded.
observation_tsvs/open-nCoV2019_observation.tsv has 1083 records.
length of all dfs: 1083
Master node TSV with 1083 total records written to master_observation.tsv.

Harmonize symptoms across nCoV2019 dataset

In [6]:
demographic_nCOV2019 = demographic_nCOV2019[["age", "gender", "subjects.submitter_id"]]
observation_nCOV2019 = observation_nCOV2019[["symptoms", "subjects.submitter_id"]]
nCOV2019 = pd.merge(
    demographic_nCOV2019, observation_nCOV2019, on="subjects.submitter_id", how="inner"
)
ncov2019 = nCOV2019[["age", "symptoms"]].dropna(subset=["symptoms"])
In [7]:
def harmonize_symptoms(dataframe):
    dataframe["symptom"] = dataframe["symptoms"].str.split(",")
    for symptom in set.union(*dataframe.symptom.apply(set)):
        dataframe[symptom] = dataframe.apply(
            lambda _: int(symptom in _.symptom), axis=1
        )
    dataframe = dataframe.drop(["symptoms", "symptom"], axis=1)
    symptoms = dataframe.columns.tolist()
    symptoms.remove("age")
    for symptom in symptoms:
        dataframe = dataframe.replace({symptom: {0: False, 1: True}})
    return dataframe

Visualize the combinations of symptoms in nCoV2019 dataset

In [8]:
ncov2019_symptom = harmonize_symptoms(ncov2019)
symptoms = ncov2019_symptom.columns.tolist()
symptoms.remove("age")
ncov2019_symptom = ncov2019_symptom.set_index(symptoms)
upset = UpSet(
    ncov2019_symptom, subset_size="count", intersection_plot_elements=10, element_size=16
)
upset.add_catplot(value="age", kind="boxen", elements=8)
upset.plot()
plt.show()

Visualize the male percentage by age in two COVID-19 datasets and TCGA-COAD

In [9]:
gdc_gender_percent = gdc_df[
    ["diagnoses.0.age_at_diagnosis", "demographic.gender"]
].dropna(subset=["diagnoses.0.age_at_diagnosis", "demographic.gender"])
gdc_gender_percent["age"] = gdc_gender_percent["diagnoses.0.age_at_diagnosis"] / 365


def define_age_decade(dataset):
    dataset.loc[(dataset["age"] < 10), "age_decade"] = "0s"
    dataset.loc[(dataset["age"] < 20) & (dataset["age"] >= 10), "age_decade"] = "10s"
    dataset.loc[(dataset["age"] < 30) & (dataset["age"] >= 20), "age_decade"] = "20s"
    dataset.loc[(dataset["age"] < 40) & (dataset["age"] >= 30), "age_decade"] = "30s"
    dataset.loc[(dataset["age"] < 50) & (dataset["age"] >= 40), "age_decade"] = "40s"
    dataset.loc[(dataset["age"] < 60) & (dataset["age"] >= 50), "age_decade"] = "50s"
    dataset.loc[(dataset["age"] < 70) & (dataset["age"] >= 60), "age_decade"] = "60s"
    dataset.loc[(dataset["age"] < 80) & (dataset["age"] >= 70), "age_decade"] = "70s"
    dataset.loc[(dataset["age"] < 90) & (dataset["age"] >= 80), "age_decade"] = "80s"
    dataset.loc[(dataset["age"] < 100) & (dataset["age"] >= 90), "age_decade"] = "90s"
    dataset.loc[(dataset["age"] >= 100), "age_decade"] = "100s"
    return dataset[dataset.age_decade.notna()]

Calculate fatality rates

The fatality rate is the number of confirmed deaths divided by the number of confirmed cases.

In [10]:
# Calculate male percentage for TCGA-COAD
gdc_gender_percent = define_age_decade(gdc_gender_percent[["age", "demographic.gender"]])
gdc_male_percent = (
    gdc_gender_percent.loc[gdc_gender_percent["demographic.gender"] == "male"]
    .groupby(["age_decade"])
    .count()
    / gdc_gender_percent.shape[0]
)
gdc_male_percent = gdc_male_percent.rename(
    columns={"age": "TCGA: Colon Adenocarcinoma"}
)
gdc_male_percent = gdc_male_percent.drop(["demographic.gender"], axis=1)

# Calculate male percentage for DSFSI
demographic_DSFSI = demographic_DSFSI.dropna(subset=["age", "gender"])
DSFSI_gender_percent = define_age_decade(demographic_DSFSI[["age", "gender"]])
DSFSI_male_percent = (
    DSFSI_gender_percent.loc[DSFSI_gender_percent["gender"] == "Male"]
    .groupby(["age_decade"])
    .count()
    / DSFSI_gender_percent.shape[0]
)
DSFSI_male_percent = DSFSI_male_percent.rename(
   columns={"age": "COVID-19 in Africa"}
)
DSFSI_male_percent = DSFSI_male_percent.drop(["gender"], axis=1)

# Calculate male percentage for nCoV2019
demographic_nCOV2019 = demographic_nCOV2019.dropna(subset=["age", "gender"])
nCOV2019_gender_percent = define_age_decade(demographic_nCOV2019[["age", "gender"]])
nCOV2019_male_percent = (
    nCOV2019_gender_percent.loc[nCOV2019_gender_percent["gender"] == "Male"]
    .groupby(["age_decade"])
    .count()
    / nCOV2019_gender_percent.shape[0]
)
nCOV2019_male_percent = nCOV2019_male_percent.rename(
   columns={"age": "COVID-19 in nCOV2019"}
)
nCOV2019_male_percent = nCOV2019_male_percent.drop(["gender"], axis=1)
In [11]:
data_frames = [
    DSFSI_male_percent,
    nCOV2019_male_percent,
    gdc_male_percent
]

fatality_all = reduce(
    lambda left, right: pd.merge(left, right, on=["age_decade"], how="outer"),
    data_frames,
)

age_decade = fatality_all.index
fig = go.Figure(
    data=[
        go.Bar(name="COVID-19 in Africa", x=age_decade, y=fatality_all["COVID-19 in Africa"]),
        go.Bar(name="COVID-19 in nCOV2019", x=age_decade, y=fatality_all["COVID-19 in nCOV2019"]),
        go.Bar(name="TCGA: Colon Adenocarcinoma (GDC)", x=age_decade, y=fatality_all['TCGA: Colon Adenocarcinoma']),
    ]
)

fig.update_layout(
    yaxis=dict(tickformat=".0%"),
    barmode="group",
    title="Male Percentage by Age in COVID-19 and TCGA Colon Adenocarcinoma",
    xaxis_title="Age",
    yaxis_title="Male Percentage",
    legend_title="Datasets",
    font=dict(size=15),
)
fig.show()